import pandas as pd
import numpy as np
import os
import pickle
import random
# data visualization
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode , plot
from plotly.graph_objs import *
init_notebook_mode()
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot
import missingno as msno
from wordcloud import WordCloud
import seaborn as sns
import squarify
# tensorflow
import tensorflow as tf
# transformers
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
# nltk
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import string
import re
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score , roc_auc_score
df = pd.read_csv('./Womens Clothing E-Commerce Reviews.csv')
def missing_values_table(data):
    """Summarize missing values per column of *data*.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        Columns 'Missing Values' and '% of Total Values', restricted to
        columns that actually contain missing values, sorted by
        percentage descending.
    """
    # Compute the null counts once and reuse them for the percentage
    # (previously isnull().sum() ran twice over the whole frame).
    missing_value = data.isnull().sum()
    missing_value_percent = missing_value / data.shape[0] * 100
    missing_value_table = pd.concat([missing_value , missing_value_percent] , axis = 1)
    missing_value_table = missing_value_table.rename(columns = {0 : 'Missing Values' , 1 : '% of Total Values'})
    missing_value_table.sort_values('% of Total Values' , ascending = False , inplace = True)
    print('Total Features: {}; {} features have missing values'.format(len(missing_value_table) , (missing_value_table['% of Total Values'] != 0).sum()))
    return missing_value_table.loc[missing_value_table['% of Total Values'] != 0]
missing_values_table(df)
print('Shape of DataFrame(before preprocess) : {}'.format(df.shape))
feature_nan = df.isnull().sum() / df.shape[0]
msno.matrix(df).set_title('Distribution of Missing Value (before)' , fontsize = 30 , fontstyle= 'oblique')
Total Features:11; There are 5 features have missing values Shape of DataFrame(before preprocess) : (23486, 11)
Text(0.5, 1.0, 'Distribution of Missing Value (before)')
df.drop(['Unnamed: 0' , 'Clothing ID'] , axis = 1 , inplace = True)
df.dropna(subset = ['Review Text' , 'Division Name' , 'Department Name' , 'Class Name' , 'Title'] , inplace = True)
df['Text'] = df['Title'] + ' ' + df['Review Text']
df.drop(['Title' , 'Review Text' , 'Division Name'] , axis = 1 , inplace = True)
df = df.reset_index().drop('index' , axis = 1)
df['Text_Length'] = df['Text'].apply(len)
print('Shape of DataFrame(after preprocess) : {}'.format(df.shape))
Shape of DataFrame(after preprocess) : (19662, 8)
msno.matrix(df).set_title('Distribution of Missing Value (after)' , fontsize = 30 , fontstyle= 'oblique')
feature_nan = df.isnull().sum() / df.shape[0]
df.info() # we got 8 columns: 5 numeric columns & 3 categorical columns.
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19662 entries, 0 to 19661 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 19662 non-null int64 1 Rating 19662 non-null int64 2 Recommended IND 19662 non-null int64 3 Positive Feedback Count 19662 non-null int64 4 Department Name 19662 non-null object 5 Class Name 19662 non-null object 6 Text 19662 non-null object 7 Text_Length 19662 non-null int64 dtypes: int64(5), object(3) memory usage: 1.2+ MB
def status(x) :
    """Return descriptive statistics for a numeric Series.

    Parameters
    ----------
    x : pd.Series
        Numeric column to describe.

    Returns
    -------
    pd.Series
        Indexed by statistic name (count, min, min_index, quartiles,
        mean, max, max_index, mad, var, std, skew, kurt).

    Note: the mean absolute deviation is computed manually because
    Series.mad() was deprecated in pandas 1.5 and removed in 2.0.
    """
    mad = (x - x.mean()).abs().mean()
    return pd.Series([x.count() , x.min() , x.idxmin() , x.quantile(.25) , x.median(), x.quantile(0.75) ,
                      x.mean() , x.max() , x.idxmax() , mad , x.var(), x.std() , x.skew() , x.kurt()] ,
                     index=['count' , 'min' , 'min_index' , '25%' , '50%' , '75%' , 'mean' , 'max' , 'max_index' , 'mad' , 'var' , 'std' , 'skew' , 'kurt'])
object_features = []
for col in df.columns:
if df[col].dtypes == 'object':
object_features.append(col)
number_feature = list(set(list(df.columns)).difference(set(object_features)))
number_describe = df[number_feature].apply(status)
object_describe = df[object_features].describe()
number_describe.T.style.background_gradient(low = 0.2 , high = 0.5 , cmap = 'rocket_r')
| count | min | min_index | 25% | 50% | 75% | mean | max | max_index | mad | var | std | skew | kurt | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Positive Feedback Count | 19662.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 3.000000 | 2.652477 | 122.000000 | 6050.000000 | 3.125654 | 34.038878 | 5.834285 | 6.339509 | 67.512044 |
| Rating | 19662.000000 | 1.000000 | 47.000000 | 4.000000 | 5.000000 | 5.000000 | 4.183145 | 5.000000 | 1.000000 | 0.902188 | 1.237043 | 1.112224 | -1.280191 | 0.706729 |
| Text_Length | 19662.000000 | 23.000000 | 11211.000000 | 217.000000 | 336.000000 | 496.000000 | 338.543739 | 557.000000 | 11265.000000 | 126.677905 | 20945.222923 | 144.724645 | -0.113424 | -1.270739 |
| Age | 19662.000000 | 18.000000 | 7417.000000 | 34.000000 | 41.000000 | 52.000000 | 43.260808 | 99.000000 | 6967.000000 | 9.999603 | 150.261562 | 12.258122 | 0.515632 | -0.139725 |
| Recommended IND | 19662.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.818177 | 1.000000 | 1.000000 | 0.297527 | 0.148771 | 0.385708 | -1.650001 | 0.722578 |
object_describe.T
| count | unique | top | freq | |
|---|---|---|---|---|
| Department Name | 19662 | 6 | Tops | 8713 |
| Class Name | 19662 | 20 | Dresses | 5371 |
| Text | 19662 | 19658 | Love retailer bathing suits! Perfect fit and i... | 3 |
# 1. distribution plot
hist_data = [df['Age']]
group_labels = ['Age Distribution']
fig1 = ff.create_distplot(hist_data , group_labels , show_hist = True , show_rug=False , colors = ['#b2182b' , '#2166ac'])
fig1.update_layout(xaxis_title = 'Age',
title = 'Age Distribution' ,
font = dict(size=17, family = 'Franklin Gothic') ,
template = 'simple_white')
fig1.show()
# 2. bar plot
age = df['Age'].value_counts().sort_index()
age = pd.DataFrame(age)
age['count'] = age['Age']
age.drop(['Age'] , axis = 1 , inplace = True)
age['age'] = age.index
age = age.reset_index(drop = True)
for i in range(0 , 10):
age.loc[age['age'].between(i * 10 , i * 10 + 9 , 'left') , 'age_group'] = '{}-{}'.format(i * 10 , i * 10 + 9)
age_df = age.groupby('age_group').apply(lambda x : sum(x['count'])).reset_index()
age_df = age_df.sort_values(by = 'age_group' , ascending = True)
age_df = age_df.rename({0: 'count'} , axis=1)
fig2 = px.bar(x = age_df['age_group'],
y = age_df['count'],
text = age_df['count'],
color = age_df['age_group'],
color_discrete_sequence = px.colors.sequential.RdBu,
template = 'simple_white',
title = 'Age Bar Plot')
# Age of customer mostly distributed in the range of 30 to 59
fig2.update_traces(width = 0.7 , marker = dict(line = dict(color = '#000000' , width = 2)))
fig2.update_layout(xaxis_title = 'Age Range',
yaxis_title = 'count',
font = dict(size = 17 , family= ' Franklin Gothic'))
fig2.show()
rating = df['Rating'].value_counts()
rating_pie = go.Pie(values = rating.values,
labels = rating.index,
marker = dict(colors = ['#fddbc7' , '#f4a582' , '#d6604d' , '#b2182b' , '#67001f']) ,
title = 'Rating' ,
titlefont = dict(size = 17))
layout = go.Layout(title = 'Rating' , font = dict(size = 17 , family= ' Franklin Gothic'))
fig1 = go.Figure(data = [rating_pie] , layout = layout)
fig1.show()
recommended = df['Recommended IND'].value_counts()
recommended_trace = go.Pie(values = recommended.values,
labels = recommended.index,
marker = dict(colors = ['#fddbc7' , '#f4a582']) ,
title = 'Recommended IND' ,
titlefont = dict(size = 17))
layout = go.Layout(title = 'Recommended IND' , font = dict(size = 17 , family= ' Franklin Gothic'))
fig2 = go.Figure(data = [recommended_trace] , layout = layout)
fig2.show()
hist_data = [df['Positive Feedback Count']]
group_labels = ['Positive Feedback Count']
# 1. distribution plot
fig1 = ff.create_distplot(hist_data , group_labels ,show_hist = True , show_rug=False , colors = ['#b2182b'])
fig1.update_layout(xaxis_title = 'Positive Feedback Count',
title = 'Positive Feedback Count Distribution' ,
font = dict(size = 17, family = 'Franklin Gothic') ,
template = 'simple_white')
fig1.show()
# 2. box plot
fig2 = px.box(df['Positive Feedback Count'] ,
color_discrete_sequence = px.colors.sequential.RdBu ,
orientation = 'h')
fig2.update_layout(title = 'Positive Feedback Count (Box Plot)',
font = dict(size = 17, family = 'Franklin Gothic') ,
template = 'simple_white')
fig2.show()
recommended_p = df.loc[df['Recommended IND'] == 1]
recommended_n = df.loc[df['Recommended IND'] == 0]
hist_data = [recommended_p['Text_Length'] , recommended_n['Text_Length']]
group_labels = ['Text of Recommended Comments' , 'Text of Unrecommended Comments']
fig = ff.create_distplot(hist_data , group_labels , show_hist = True , show_rug=False , colors = ['#b2182b' , '#2166ac'])
fig.update_layout(xaxis_title = 'Text Length',
title = 'Text Length by Recommended IND' ,
font = dict(size = 17 , family = 'Franklin Gothic') ,
template = 'simple_white')
fig.show()
fig = px.treemap(df ,
path = ['Department Name' , 'Class Name'],
title = ' Tree Map: Department & Class Name',
color_discrete_sequence = px.colors.sequential.RdBu,
width = 1200 ,
height = 600)
fig.update_traces(textinfo = 'label+value' ,
textfont_size = 13,
marker = dict(line = dict(color = 'white' , width = 0.2)))
fig.update_layout(font = dict(size = 17 , family = 'Franklin Gothic'))
fig.show()
department_count = df['Department Name'].value_counts()
department_count = pd.DataFrame(department_count)
department_count = department_count.rename({'Department Name' : 'count'} , axis = 1)
department_count['Department Name'] = department_count.index
department_count = department_count.reset_index(drop = True)
fig = px.bar(x = department_count['Department Name'],
y = department_count['count'],
text = department_count['count'],
color = department_count['Department Name'],
color_discrete_sequence = px.colors.sequential.RdBu,
template = 'simple_white',
title = 'Department Bar Plot')
fig.update_traces(width = 0.7 , marker = dict(line = dict(color = '#000000' , width = 2)))
fig.update_layout(xaxis_title = 'Department',
yaxis_title='count',
font = dict(size=17 , family = 'Franklin Gothic'))
fig.show()
class_count = df['Class Name'].value_counts()
class_count = pd.DataFrame(class_count)
class_count = class_count.rename({'Class Name' : 'count'} , axis = 1)
class_count['Class Name'] = class_count.index
class_count = class_count.reset_index(drop = True)
fig = px.bar(x = class_count['Class Name'],
y = class_count['count'],
text = class_count['count'],
color = class_count['Class Name'],
color_discrete_sequence = px.colors.sequential.RdBu,
template = 'simple_white',
title = 'Class Bar Plot')
fig.update_traces(width = 0.7 , marker = dict(line = dict(color = '#000000' , width = 2)))
fig.update_layout(xaxis_title = 'Class',
yaxis_title='count',
font = dict(size=17 , family = 'Franklin Gothic'))
fig.show()
# get all of strings from sentences
# get all of strings from sentences
def get_all_str(sentences):
    """Concatenate an iterable of strings and lower-case the result.

    Uses str.join, which is O(n) overall; the previous '+=' loop could
    be quadratic on large corpora.
    """
    return ''.join(sentences).lower()
# get word from text
# get word from text
def get_word(text):
    """Strip HTML, remove punctuation/digits, and tokenize to lowercase words."""
    stripped = BeautifulSoup(text , 'html.parser').get_text()
    punct_pattern = '[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]'
    stripped = re.sub(punct_pattern , ' ', stripped)
    stripped = re.sub('[^a-zA-Z]' , ' ' , stripped)
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(stripped.lower())
# remove stopwords from list
# remove stopwords from list
def remove_stopword(lst):
    """Drop NLTK English stopwords from a token list, preserving order."""
    english_stopwords = set(stopwords.words('english'))
    return [token for token in lst if token not in english_stopwords]
# lemmatize
# lemmatize
def lemmatization(lst):
    """Lemmatize every token with WordNet (default noun POS)."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in lst]
def pos_adjective(word_tagged):
    """Keep only adjectives (tags JJ / JJR / JJS) from (word, tag) pairs."""
    adjective_tags = ('JJ' , 'JJR' , 'JJS')
    adjectives = []
    for word , tag in word_tagged:
        if tag in adjective_tags:
            adjectives.append(word)
    return adjectives
def freq_df(lst):
    """Build a term-frequency table from a list of tokens.

    Parameters
    ----------
    lst : list of str
        Tokens to count.

    Returns
    -------
    pd.DataFrame
        Columns 'Frequency' and 'Term', sorted by Frequency descending,
        index reset.
    """
    # Counter is equivalent to nltk.FreqDist (FreqDist subclasses
    # Counter) and avoids the NLTK dependency for a simple count.
    from collections import Counter
    df_freq = pd.DataFrame.from_dict(Counter(lst) , orient = 'index')
    # (the previous redundant pd.DataFrame(df_freq) re-wrap was removed)
    df_freq['Term'] = df_freq.index
    df_freq = df_freq.rename({0 : 'Frequency'} , axis = 1)
    df_freq = df_freq.sort_values(by = ['Frequency'] , ascending = False)
    df_freq = df_freq.reset_index(drop = True)
    return df_freq
def preprocess(column):
    """Full text-frequency pipeline: concatenate -> tokenize ->
    drop stopwords -> lemmatize -> term-frequency DataFrame
    (columns 'Frequency' and 'Term').

    NOTE(review): a later cell redefines ``preprocess`` to return the
    lemmatized token list instead of a frequency table, shadowing this
    definition from that point on.
    """
    all_str = get_all_str(column)
    words = get_word(all_str)
    words_remove_stopword = remove_stopword(words)
    words_lemmatize = lemmatization(words_remove_stopword)
    frequency_df = freq_df(words_lemmatize)
    return frequency_df
# function to plot word cloud of words
# function to plot word cloud of words
def Word_Cloud(data , color_background , colormap , title):
    """Render and show a word cloud from a {term: frequency} mapping."""
    plt.figure(figsize = (20 , 15))
    cloud = WordCloud(width = 800 ,
                      height = 400 ,
                      max_words = 100 ,
                      colormap = colormap ,
                      max_font_size = 200 ,
                      min_font_size = 1 ,
                      random_state = 0 ,
                      background_color = color_background)
    cloud = cloud.generate_from_frequencies(data)
    plt.imshow(cloud , interpolation = 'bilinear')
    plt.title(title , fontsize = 20)
    plt.axis('off')
    plt.show()
def create_wordcloud(frequency_df , title , color):
    """Plot a white-background word cloud from a frequency DataFrame."""
    term_to_freq = frequency_df.set_index('Term')['Frequency'].to_dict()
    Word_Cloud(term_to_freq , 'white' , color , title)
# function to plot word treemap of words
def wordcount_treemap(data , word_num , title):
fig , ax = plt.subplots(1 , figsize = (12 , 12))
word_num = 100
squarify.plot(sizes = list(data.values())[:word_num] ,
label = list(data.keys())[:word_num] ,
value = list(data.values())[:word_num] ,
text_kwargs = {'fontsize': 13 , 'color': 'white'} ,
color = sns.color_palette('hls' , 8) ,
ec = 'white')
plt.title(title , fontsize = 20)
plt.axis('off')
plt.show()
def create_wordcount_treemap(frequency_df , title , word_num):
    """Treemap of the top `word_num` terms in a frequency DataFrame."""
    term_counts = frequency_df.set_index('Term')['Frequency'].to_dict()
    wordcount_treemap(term_counts , word_num , title)
top10_frequent = preprocess(df['Text'])[:10]
# bar plot
fig = px.bar(top10_frequent ,
x = 'Term' ,
y = 'Frequency' ,
text = 'Frequency' ,
color_discrete_sequence = px.colors.sequential.RdBu ,
title = 'Top 10 frequent terms (bar plot)',
color = 'Term' ,
template = 'simple_white')
fig.update_traces(width = 0.7 , marker = dict(line = dict(color = '#000000' , width = 2)))
fig.update_layout(font = dict(size = 17 , family = 'Franklin Gothic'))
fig.show()
frequency_df = preprocess(df['Text'])
create_wordcount_treemap(frequency_df , 'Text of Comments (Word Treemap)' , 100)
for i in range(0 , 10):
df.loc[df['Age'].between(i * 10 , i * 10 + 9 , 'left') , 'age_group'] = '{}-{}'.format(i * 10 , i * 10 + 9)
# 10-19 : bottom(%) , dresses(%) , Intimate(%) , Jackets(%) , Tops(%) , Trend(%)
# 20-29 : bottom(%) , dresses(%) , Intimate(%) , Jackets(%) , Tops(%) , Trend(%)
# ...
# 90-99 : bottom(%) , dresses(%) , Intimate(%) , Jackets(%) , Tops(%) , Trend(%)
age_groups = sorted(df['age_group'].unique())
department_groups = sorted(df['Department Name'].unique())
for i , age_group in enumerate(age_groups):
age_group_df = df.loc[df['age_group'] == age_group]
age_department_df = age_group_df.groupby(['Department Name']).apply(lambda x : len(x) / len(age_group_df))
age_department_df = pd.DataFrame(age_department_df)
age_department_df['Department Name'] = age_department_df.index
age_department_df = age_department_df.reset_index(drop = True)
age_department_df = pd.merge(left = age_group_df , right = age_department_df , on = 'Department Name' , how = 'left')
age_department_df = age_department_df.rename({0 : 'percentage'} , axis = 1)
age_department_df = age_department_df[['Department Name' , 'age_group' , 'percentage']]
age_department_df = age_department_df.drop_duplicates(subset = 'Department Name').reset_index(drop = True)
if i == 0:
age_department_df_total = age_department_df
else:
age_department_df_total = pd.concat([age_department_df_total , age_department_df] , axis = 0)
age_department_df_total = age_department_df_total.reset_index(drop = True)
def func(x):
    """Add a 'scaled_percentage' column to a group frame.

    Each row's 'percentage' is divided by the group total so the values
    sum to 1 within the group. Mutates `x` in place and returns it
    (intended for use with groupby().apply).
    """
    # Series.sum() is the idiomatic (and faster) form of sum(x[...]).
    x['scaled_percentage'] = x['percentage'] / x['percentage'].sum()
    return x
age_department_df_total = age_department_df_total.groupby('Department Name').apply(lambda x: func(x))
department_age_df_list = []
for age in age_groups:
department_age_df = age_department_df_total.loc[age_department_df_total['age_group'] == age].sort_values(by = 'age_group' , ascending = True)
department_age_df_list.append(department_age_df)
marker_color = ['#67001f','#b2182b','#d6604d','#f4a582','#fddbc7','#d1e5f0','#92c5de','#4393c3','#2166ac']
data = []
for i in range(0 , len(age_groups) - 2):
data.append(go.Bar(name = age_groups[i] ,
x = department_age_df_list[i]['Department Name'] ,
y = department_age_df_list[i]['scaled_percentage'] ,
text = np.round(department_age_df_list[i]['percentage'] , 2) ,
marker_color = marker_color[i]))
fig = go.Figure(data = data)
fig.update_traces(marker = dict(line = dict(color = '#000000' , width = 2)))
fig.update_layout(title ='Department by Age',
xaxis_title = 'Department',
yaxis_title = 'Aget Percentage',
font = dict(size = 17 , family = 'Franklin Gothic') ,
template = "simple_white")
fig.show()
# the percentage of Rating 1 ~ Rating 5 in a department
# Jacket and bottoms both have high ratings, with few low ratings, while the trends have few high ratings and many low ratings.
# bottoms : Rating 1(%) , Rating 2(%) , Rating 3(%) , Rating 4(%) , Rating 5(%) , Rating 6(%)
# dresses : Rating 1(%) , Rating 2(%) , Rating 3(%) , Rating 4(%) , Rating 5(%) , Rating 6(%)
# ...
# Trend : Rating 1(%) , Rating 2(%) , Rating 3(%) , Rating 4(%) , Rating 5(%) , Rating 6(%)
# Jacket and bottoms both have high ratings, with few low ratings, while the trends have few high ratings and many low ratings.
rating_groups = sorted(df['Rating'].unique())
department_groups = sorted(df['Department Name'].unique())
for i , department_group in enumerate(department_groups):
department_group_df = df.loc[df['Department Name'] == department_group]
rating_department_df = department_group_df.groupby(['Rating']).apply(lambda x : len(x) / len(department_group_df))
rating_department_df = pd.DataFrame(rating_department_df)
rating_department_df['Rating'] = rating_department_df.index
rating_department_df = rating_department_df.reset_index(drop = True)
rating_department_df = pd.merge(left = department_group_df , right = rating_department_df , on = 'Rating' , how = 'left')
rating_department_df = rating_department_df.rename({0 : 'percentage'} , axis = 1)
rating_department_df =rating_department_df[['Department Name' , 'Rating' , 'percentage']]
rating_department_df = rating_department_df.drop_duplicates(subset = 'Rating').reset_index(drop = True)
if i == 0:
rating_department_df_total = rating_department_df
else:
rating_department_df_total = pd.concat([rating_department_df_total , rating_department_df] , axis = 0)
# Split the aggregate frame into one sub-frame per rating value, with
# 'Rating' cast to str for categorical plotting.
department_rating_df_list = []
for rating in rating_groups:
    # .copy() so the astype assignment below writes to an independent
    # frame rather than a view of rating_department_df_total
    # (avoids SettingWithCopyWarning and silent aliasing).
    department_rating_df = rating_department_df_total.loc[rating_department_df_total['Rating'] == rating].copy()
    department_rating_df['Rating'] = department_rating_df['Rating'].astype(str)
    department_rating_df_list.append(department_rating_df)
marker_color = ['#67001f','#b2182b','#d6604d','#f4a582','#fddbc7','#d1e5f0','#92c5de','#4393c3','#2166ac']
data = []
for i in range(0 , len(rating_groups)):
data.append(go.Bar(name = str(rating_groups[i]) ,
x = department_rating_df_list[i]['Department Name'] ,
y = department_rating_df_list[i]['percentage'] ,
text = np.round(department_rating_df_list[i]['percentage'] , 2) ,
marker_color = marker_color[i]))
fig = go.Figure(data = data)
fig.update_traces(marker = dict(line = dict(color = '#000000' , width = 2)))
fig.update_layout(title ='Department by Rating',
xaxis_title = 'Department',
yaxis_title = 'Rating Percentage',
font = dict(size = 17 , family = 'Franklin Gothic') ,
template = "simple_white")
fig.show()
# 1 : Rating 1(%) , Rating 2(%) , Rating 3(%) , Rating 4(%) , Rating 5(%) , Rating 6(%)
# 0 : Rating 1(%) , Rating 2(%) , Rating 3(%) , Rating 4(%) , Rating 5(%) , Rating 6(%)
recommend_groups = sorted(df['Recommended IND'].unique())
rating_groups = sorted(df['Rating'].unique())
for i , recommend_group in enumerate(recommend_groups):
recommend_group_df = df.loc[df['Recommended IND'] == recommend_group]
rating_recommend_df = recommend_group_df.groupby(['Rating']).apply(lambda x : len(x) / len(recommend_group_df))
rating_recommend_df = pd.DataFrame(rating_recommend_df)
rating_recommend_df['Rating'] = rating_recommend_df.index
rating_recommend_df = rating_recommend_df.reset_index(drop = True)
rating_recommend_df = pd.merge(left = recommend_group_df , right = rating_recommend_df , on = 'Rating' , how = 'left')
rating_recommend_df = rating_recommend_df.rename({0 : 'percentage'} , axis = 1)
rating_recommend_df = rating_recommend_df[['Recommended IND' , 'Rating' , 'percentage']]
rating_recommend_df = rating_recommend_df.drop_duplicates(subset = 'Rating').reset_index(drop = True)
if i == 0:
rating_recommend_df_total = rating_recommend_df
else:
rating_recommend_df_total = pd.concat([rating_recommend_df_total , rating_recommend_df] , axis = 0)
# Split the aggregate frame into one sub-frame per rating value, with
# 'Rating' cast to str for categorical plotting.
recommend_rating_df_list = []
for rating in rating_groups:
    # .copy() so the astype assignment below writes to an independent
    # frame rather than a view of rating_recommend_df_total
    # (avoids SettingWithCopyWarning and silent aliasing).
    recommend_rating_df = rating_recommend_df_total.loc[rating_recommend_df_total['Rating'] == rating].copy()
    recommend_rating_df['Rating'] = recommend_rating_df['Rating'].astype(str)
    recommend_rating_df_list.append(recommend_rating_df)
marker_color = ['#67001f','#b2182b','#d6604d','#f4a582','#fddbc7','#d1e5f0','#92c5de','#4393c3','#2166ac']
data = []
for i in range(0 , len(rating_groups)):
data.append(go.Bar(name = str(rating_groups[i]) ,
x = recommend_rating_df_list[i]['Recommended IND'] ,
y = recommend_rating_df_list[i]['percentage'] ,
text = np.round(recommend_rating_df_list[i]['percentage'] , 2) ,
marker_color = marker_color[i]))
fig = go.Figure(data = data)
fig.update_traces(marker = dict(line = dict(color = '#000000' , width = 2)))
fig.update_layout(title ='Recommended IND by Rating',
xaxis_title = 'Recommended IND',
yaxis_title = 'Rating Percentage',
font = dict(size = 17 , family = 'Franklin Gothic') ,
template = "simple_white")
fig.show()
def preprocess_adj(column):
    """Adjective-only frequency pipeline.

    Concatenate texts, tokenize, drop stopwords, lemmatize, POS-tag,
    keep only adjectives (JJ/JJR/JJS), and return a term-frequency
    DataFrame (columns 'Frequency' and 'Term').
    """
    all_str = get_all_str(column)
    words = get_word(all_str)
    words_remove_stopword = remove_stopword(words)
    words_lemmatize = lemmatization(words_remove_stopword)
    word_tagged = nltk.pos_tag(words_lemmatize)
    words_adj = pos_adjective(word_tagged)
    frequency_df = freq_df(words_adj)
    return frequency_df
recommended_p = df[df['Recommended IND'] == 1]
df_freq_adj_p = preprocess_adj(recommended_p['Text'])
top10_frequent_p = df_freq_adj_p[:10]
# bar plot
fig = px.bar(top10_frequent_p ,
x = 'Term' ,
y = 'Frequency' ,
text = 'Frequency' ,
color_discrete_sequence = px.colors.sequential.RdBu ,
title = 'Top 10 frequent recommended terms (bar plot)',
color = 'Term' ,
template = 'simple_white')
fig.update_traces(width = 0.7 , marker = dict(line = dict(color = '#000000' , width = 2)))
fig.update_layout(font = dict(size = 17 , family = 'Franklin Gothic'))
fig.show()
def preprocess(column):
    """Clean text for model input: concatenate, tokenize, drop
    stopwords, lemmatize, and return the lemmatized token list.

    NOTE(review): this redefines the earlier ``preprocess`` (which
    returned a frequency DataFrame); from this cell on, callers
    receive the token list instead.
    """
    all_str = get_all_str(column)
    words = get_word(all_str)
    words_remove_stopword = remove_stopword(words)
    words_lemmatize = lemmatization(words_remove_stopword)
    return words_lemmatize
df.to_csv('df_no_missing.csv')
df = pd.read_csv('./df_no_missing.csv')
df['Text'] = df.apply(lambda x : preprocess(x['Text']) , axis = 1)
df.to_csv('df_cleaning.csv')
df = pd.read_csv('./df_cleaning.csv')
y = df['Recommended IND']
X = df.drop('Recommended IND' , axis = 1)
X['Text'] = X['Text'].apply(lambda x : eval(x))
# hyperparameter
batch_size = 128
epochs = 10
# split by ratio of 0.3
X_train , X_test , y_train , y_test = train_test_split(X['Text'] , y , test_size = 0.3 , random_state = 0)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
seq_max_len = max(X['Text'].apply(lambda x : len(x)))
X_train_text = tokenizer(list(X_train) ,
is_split_into_words = True ,
padding = True ,
truncation=True ,
max_length = seq_max_len ,
return_tensors = "tf")
X_test_text = tokenizer(list(X_test) ,
is_split_into_words = True ,
padding = True ,
truncation = True ,
max_length = seq_max_len ,
return_tensors = "tf")
y_train , y_test = np.array(y_train.astype(np.int32)) , np.array(y_test.astype(np.int32))
2023-11-27 17:18:17.539009: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:18:17.562233: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:18:17.562383: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:18:17.562811: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-11-27 17:18:17.563371: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:18:17.563493: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:18:17.563604: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:18:17.980996: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:18:17.981162: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from 
SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:18:17.981278: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:18:17.981370: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9297 MB memory: -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased' , num_labels = 2)
optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-5)
All PyTorch model weights were used when initializing TFBertForSequenceClassification. Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
def minibatch_index(minibatch_num , len_data , training = True):
    """Build batched index lists in the shape expected by tf.gather_nd.

    Parameters
    ----------
    minibatch_num : int
        Batch size.
    len_data : int
        Number of samples.
    training : bool
        Shuffle the sample order when True.

    Returns
    -------
    list
        ceil(len_data / minibatch_num) batches; each batch is a list of
        `minibatch_num` single-element index lists. When len_data is not
        a multiple of the batch size, the final batch wraps around to
        the start of the (possibly shuffled) order so every batch is
        full.

    Bug fix: the previous `step = (int(len_data / mb) + 1) * mb` always
    rounded up even when mb divided len_data exactly, emitting one whole
    extra batch of duplicated samples — e.g. with minibatch_num=1 the
    test set evaluated sample 0 twice, skewing every reported metric.
    """
    total_index = list(range(len_data))
    if training:
        random.shuffle(total_index)
    # Ceil division: number of batches needed to cover all samples.
    n_batches = (len_data + minibatch_num - 1) // minibatch_num
    index = []
    pos = 0
    for _ in range(n_batches):
        batch = [[total_index[(pos + k) % len_data]] for k in range(minibatch_num)]
        pos += minibatch_num
        index.append(batch)
    return index
train_index_list = minibatch_index(minibatch_num = batch_size , len_data = len(X_train))
test_index_list = minibatch_index(minibatch_num = 1 , len_data = len(X_test) , training = False)
@tf.function
def train_step(x , y):
    """One BERT fine-tuning step: forward pass, manual cross-entropy,
    backprop through `optimizer`.

    Args:
        x: dict with 'input_ids' and 'attention_mask' tensors.
        y: integer class labels (0 / 1).

    Returns:
        (cross_entropy, accuracy, y_pred) — scalar loss, scalar batch
        accuracy, and the predicted probability of class 1 reshaped to
        [-1, 1] (used for AUC in the training loop).
    """
    with tf.GradientTape() as tape:
        bert_output = bert_model(x)
        # Softmax over the two logits -> per-class probabilities.
        prediction = tf.nn.softmax(bert_output.logits , axis = -1)
        y_pred = prediction[: , 1]
        y_pred = tf.reshape(y_pred , [-1 , 1])
        # Clip before log to avoid log(0); manual cross-entropy against
        # the one-hot labels.
        prediction = tf.math.log(tf.clip_by_value(prediction , 1e-8 , tf.reduce_max(prediction)))
        y_onehot = tf.one_hot(y , depth = 2)
        cross_entropy_temp = -tf.reduce_sum(y_onehot * prediction , axis = 1)
        cross_entropy = tf.reduce_mean(cross_entropy_temp)
        # argmax of log-probs equals argmax of probs, so accuracy is
        # unaffected by the log transform above.
        correct = tf.equal(tf.math.argmax(prediction , 1) , tf.argmax(y_onehot , 1))
        correct = tf.cast(correct , tf.float32)
        accuracy = tf.reduce_mean(correct)
    var_list = [var for var in bert_model.trainable_variables]
    grads = tape.gradient(cross_entropy , var_list)
    optimizer.apply_gradients(grads_and_vars = zip(grads , var_list))
    return cross_entropy , accuracy , y_pred
@tf.function
def test_step(x , y):
    """Evaluation step: forward pass and cross-entropy, no gradients.

    Args:
        x: dict with 'input_ids' and 'attention_mask' tensors.
        y: integer class labels (0 / 1).

    Returns:
        (cross_entropy, y_pred) — scalar loss and the predicted
        probability of class 1 reshaped to [-1, 1].
    """
    bert_output = bert_model(x)
    prediction = tf.nn.softmax(bert_output.logits , axis = -1)
    y_pred = prediction[: , 1]
    y_pred = tf.reshape(y_pred , [-1 , 1])
    # Clip before log to avoid log(0); manual cross-entropy.
    prediction = tf.math.log(tf.clip_by_value(prediction , 1e-8 , tf.reduce_max(prediction)))
    # Flatten y first (unlike train_step, y may arrive with an extra
    # dim from the batch-of-1 test indexing).
    y_onehot = tf.one_hot(tf.reshape(y , [-1 , ]) , depth = 2)
    cross_entropy_temp = -tf.reduce_sum(y_onehot * prediction , axis = 1)
    cross_entropy = tf.reduce_mean(cross_entropy_temp)
    return cross_entropy , y_pred
# Per-epoch history of loss / accuracy / AUC for both splits.
auc_acc_loss_history = {'train':{'loss':{} , 'acc':{} , 'auc':{}} , 'test' :{'loss':{} , 'acc':{} , 'auc':{}}}
for epoch_i in range(0 , epochs):
    # training
    loss_train = 0
    accuracy_train = 0
    auc_train = 0
    for batch_i , index in enumerate(train_index_list):
        x_train_batch = {'input_ids': tf.gather_nd(X_train_text['input_ids'] , indices = index) ,
                         'attention_mask': tf.gather_nd(X_train_text['attention_mask'] , indices = index)}
        y_train_batch = y_train[np.squeeze(np.array(index))]
        loss_train_batch , acc_train_batch , y_pred_train_batch = train_step(x_train_batch , y_train_batch)
        loss_train += loss_train_batch.numpy()
        accuracy_train += acc_train_batch.numpy()
        auc_train += roc_auc_score(y_train_batch , y_pred_train_batch.numpy())
    # Average the per-batch metrics over the epoch.
    loss_train /= len(train_index_list)
    accuracy_train /= len(train_index_list)
    auc_train /= len(train_index_list)
    # testing
    loss_test = 0
    y_test_list = []
    y_pred_test_list = []
    for batch_i , index in enumerate(test_index_list):
        x_test_batch = {'input_ids': tf.gather_nd(X_test_text['input_ids'] , indices = index) ,
                        'attention_mask': tf.gather_nd(X_test_text['attention_mask'] , indices = index)}
        y_test_batch = y_test[np.squeeze(np.array(index))]
        loss_test_batch , y_pred_test_batch = test_step(x_test_batch , y_test_batch)
        # BUG FIX: previously accumulated loss_train_batch here, so the
        # reported test loss was just the last training-batch loss
        # replicated over the test set.
        loss_test += loss_test_batch.numpy()
        y_test_list.append(y_test_batch.astype(float))
        y_pred_test_list.append(y_pred_test_batch.numpy())
    loss_test /= len(test_index_list)
    # Test batches are size 1, so accuracy/AUC are computed over the
    # concatenated predictions instead of per batch.
    accuracy_test = accuracy_score(np.reshape(np.array(y_test_list) , [-1 , ]) , (np.reshape(np.array(y_pred_test_list), [-1 , ]) > 0.5).astype(np.float32))
    auc_test = roc_auc_score(np.reshape(np.array(y_test_list) , [-1 , ]) , np.reshape(np.array(y_pred_test_list), [-1 , ]))
    print('=' * 83)
    print('epoch_i : {} , train_loss : {:.4f} , train_accuracy : {:.2%} , train_auc : {:.4f}'.format(epoch_i+1 , loss_train , accuracy_train , auc_train))
    auc_acc_loss_history['train']['loss'][epoch_i + 1] = loss_train
    auc_acc_loss_history['train']['acc'][epoch_i + 1] = accuracy_train
    auc_acc_loss_history['train']['auc'][epoch_i + 1] = auc_train
    print('epoch_i : {} , test_loss : {:.4f} , test_accuracy : {:.2%} , test_auc : {:.4f}'.format(epoch_i+1 , loss_test , accuracy_test , auc_test))
    auc_acc_loss_history['test']['loss'][epoch_i + 1] = loss_test
    auc_acc_loss_history['test']['acc'][epoch_i + 1] = accuracy_test
    auc_acc_loss_history['test']['auc'][epoch_i + 1] = auc_test
    # Checkpoint after every epoch so any epoch can be restored later.
    bert_model.save_weights('bert_model/epoch_{}/'.format(epoch_i + 1))
# Persist the tokenized inputs, labels, and metric history for reload.
bert_save = {'X_train_text' : X_train_text ,
             'X_test_text' : X_test_text ,
             'y_train' : y_train ,
             'y_test' : y_test,
             'auc_acc_loss_history' : auc_acc_loss_history}
with open('bert_save.pkl', 'wb') as f:
    pickle.dump(bert_save , f)
=================================================================================== epoch_i : 1 , train_loss : 0.3473 , train_accuracy : 85.93% , train_auc : 0.8488 epoch_i : 1 , test_loss : 0.2440 , test_accuracy : 90.66% , test_auc : 0.9471 =================================================================================== epoch_i : 2 , train_loss : 0.1929 , train_accuracy : 92.22% , train_auc : 0.9618 epoch_i : 2 , test_loss : 0.1350 , test_accuracy : 90.51% , test_auc : 0.9513 =================================================================================== epoch_i : 3 , train_loss : 0.1332 , train_accuracy : 95.01% , train_auc : 0.9805 epoch_i : 3 , test_loss : 0.0752 , test_accuracy : 90.14% , test_auc : 0.9470 =================================================================================== epoch_i : 4 , train_loss : 0.0991 , train_accuracy : 96.52% , train_auc : 0.9912 epoch_i : 4 , test_loss : 0.0618 , test_accuracy : 90.64% , test_auc : 0.9377 =================================================================================== epoch_i : 5 , train_loss : 0.0729 , train_accuracy : 97.37% , train_auc : 0.9954 epoch_i : 5 , test_loss : 0.0565 , test_accuracy : 90.25% , test_auc : 0.9295 =================================================================================== epoch_i : 6 , train_loss : 0.0555 , train_accuracy : 98.13% , train_auc : 0.9972 epoch_i : 6 , test_loss : 0.0371 , test_accuracy : 90.88% , test_auc : 0.9446 =================================================================================== epoch_i : 7 , train_loss : 0.0332 , train_accuracy : 99.01% , train_auc : 0.9983 epoch_i : 7 , test_loss : 0.0427 , test_accuracy : 89.83% , test_auc : 0.9466 =================================================================================== epoch_i : 8 , train_loss : 0.0196 , train_accuracy : 99.54% , train_auc : 0.9992 epoch_i : 8 , test_loss : 0.0400 , test_accuracy : 91.12% , test_auc : 0.9413 
=================================================================================== epoch_i : 9 , train_loss : 0.0098 , train_accuracy : 99.79% , train_auc : 0.9997 epoch_i : 9 , test_loss : 0.0325 , test_accuracy : 91.24% , test_auc : 0.9376 =================================================================================== epoch_i : 10 , train_loss : 0.0070 , train_accuracy : 99.82% , train_auc : 0.9999 epoch_i : 10 , test_loss : 0.0239 , test_accuracy : 91.14% , test_auc : 0.9323
# Restore everything the training run pickled: tokenized train/test inputs,
# label arrays, and the per-epoch metric history.
with open('bert_save.pkl', 'rb') as f:
    bert_save = pickle.load(f)

X_train_text = bert_save['X_train_text']
X_test_text = bert_save['X_test_text']
y_train = bert_save['y_train']
y_test = bert_save['y_test']
auc_acc_loss_history = bert_save['auc_acc_loss_history']
# Drop the container dict once its contents are unpacked to release memory.
del bert_save

# Rebuild the architecture from the pretrained checkpoint, then overwrite its
# weights with the fine-tuned epoch-8 snapshot saved during training.
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
bert_model.load_weights('bert_model/epoch_8/')
2023-11-27 17:39:37.481327: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:39:37.507196: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:39:37.507354: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:39:37.507741: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-11-27 17:39:37.508332: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:39:37.508466: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:39:37.508580: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:39:37.923289: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:39:37.923453: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from 
SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:39:37.923568: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-11-27 17:39:37.923661: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9328 MB memory: -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5 All PyTorch model weights were used when initializing TFBertForSequenceClassification. Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f72e0417040>
@tf.function
def inference(x , y):
    """Run one forward pass of the fine-tuned BERT classifier.

    Parameters
    ----------
    x : dict of tf.Tensor
        Tokenized batch with 'input_ids' and 'attention_mask' keys
        (the structure produced by the BERT tokenizer).
    y : tensor-like of int
        Ground-truth class labels (0 or 1), one per example.

    Returns
    -------
    cross_entropy : tf.Tensor (scalar)
        Mean cross-entropy loss over the batch.
    y_pred : tf.Tensor, shape [batch, 1]
        Predicted probability of the positive class (column 1 of softmax).
    """
    bert_output = bert_model(x)
    logits = bert_output.logits
    # Positive-class probability, reshaped to a column vector for the
    # downstream metric code (AUC, rounding to labels, ...).
    prediction = tf.nn.softmax(logits , axis = -1)
    y_pred = tf.reshape(prediction[: , 1] , [-1 , 1])
    # Compute the loss from logits with the fused, numerically stable op
    # instead of log(clip(softmax(logits), 1e-8, max)) + one-hot dot product:
    # same quantity, no clipping hack, no overflow/underflow in the log.
    labels = tf.reshape(y , [-1 , ])
    cross_entropy_temp = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = labels , logits = logits)
    cross_entropy = tf.reduce_mean(cross_entropy_temp)
    return cross_entropy , y_pred
def minibatch_index(minibatch_num , len_data , training = True):
    """Partition sample indices into full minibatches for `tf.gather_nd`.

    Parameters
    ----------
    minibatch_num : int
        Number of samples per minibatch.
    len_data : int
        Total number of samples to index.
    training : bool, default True
        When True, shuffle the index order before batching.

    Returns
    -------
    list
        `ceil(len_data / minibatch_num)` batches; each batch is a list of
        `minibatch_num` single-element lists (the index shape expected by
        `tf.gather_nd`). When `len_data` is not a multiple of
        `minibatch_num`, the final batch wraps around to the start of the
        index list so that every batch is full.

    Bug fixed: the previous step count
    `(len_data // minibatch_num + 1) * minibatch_num` produced one entire
    extra (duplicate) batch whenever `len_data` was divisible by
    `minibatch_num` — e.g. at batch size 1 it emitted `len_data + 1`
    batches, scoring sample 0 twice in the evaluation loops.
    """
    total_index = list(range(len_data))
    if training:
        random.shuffle(total_index)
    # Ceiling division: just enough full batches to cover every sample once,
    # with wrap-around padding only when len_data % minibatch_num != 0.
    num_batches = -(-len_data // minibatch_num)
    index = []
    for b in range(num_batches):
        batch = [[total_index[(b * minibatch_num + k) % len_data]]
                 for k in range(minibatch_num)]
        index.append(batch)
    return index
# Batch size 1 so each sample is fed to the model individually for evaluation;
# training=False keeps the original index order (no shuffling).
# NOTE(review): as minibatch_index is written, minibatch_num=1 yields
# len_data + 1 batches (the step count wraps past the end), so the first
# sample appears twice in these index lists — verify against the metrics.
train_index_list = minibatch_index(minibatch_num = 1 , len_data = len(y_train) , training = False)
test_index_list = minibatch_index(minibatch_num = 1 , len_data = len(y_test) , training = False)
# Score the fine-tuned model over the whole training set, one sample per batch.
y_train_list = []
y_pred_train_list = []
for index in train_index_list:
    batch_inputs = {'input_ids': tf.gather_nd(X_train_text['input_ids'] , indices = index) ,
                    'attention_mask': tf.gather_nd(X_train_text['attention_mask'] , indices = index)}
    batch_labels = y_train[np.squeeze(np.array(index))]
    _ , batch_probs = inference(batch_inputs , batch_labels)
    y_train_list.append(batch_labels.astype(float))
    y_pred_train_list.append(batch_probs.numpy())

# Flatten collected labels and positive-class probabilities to 1-D arrays.
y_true_train = np.reshape(np.array(y_train_list) , [-1 , ])
y_prob_train = np.reshape(np.array(y_pred_train_list) , [-1 , ])
# Threshold probabilities at 0.5 to obtain hard label predictions.
y_label_train = np.round(y_prob_train)

# Train-set scores: accuracy / precision / recall / F1 on hard labels,
# AUC on the raw probabilities.
train_metrics_list = [accuracy_score(y_true_train , y_label_train) ,
                      precision_score(y_true_train , y_label_train) ,
                      recall_score(y_true_train , y_label_train) ,
                      f1_score(y_true_train , y_label_train) ,
                      roc_auc_score(y_true_train , y_prob_train)]
# Score the fine-tuned model over the whole held-out test set, one sample per batch.
y_test_list = []
y_pred_test_list = []
for index in test_index_list:
    batch_inputs = {'input_ids': tf.gather_nd(X_test_text['input_ids'] , indices = index) ,
                    'attention_mask': tf.gather_nd(X_test_text['attention_mask'] , indices = index)}
    batch_labels = y_test[np.squeeze(np.array(index))]
    _ , batch_probs = inference(batch_inputs , batch_labels)
    y_test_list.append(batch_labels.astype(float))
    y_pred_test_list.append(batch_probs.numpy())

# Flatten collected labels and positive-class probabilities to 1-D arrays.
y_true_test = np.reshape(np.array(y_test_list) , [-1 , ])
y_prob_test = np.reshape(np.array(y_pred_test_list) , [-1 , ])
# Threshold probabilities at 0.5 to obtain hard label predictions.
y_label_test = np.round(y_prob_test)

# Test-set scores: accuracy / precision / recall / F1 on hard labels,
# AUC on the raw probabilities.
test_metrics_list = [accuracy_score(y_true_test , y_label_test) ,
                     precision_score(y_true_test , y_label_test) ,
                     recall_score(y_true_test , y_label_test) ,
                     f1_score(y_true_test , y_label_test) ,
                     roc_auc_score(y_true_test , y_prob_test)]
# Side-by-side bar chart comparing train vs. test scores for each metric.
metrics_name = ['Accuracy' , 'Precision' , 'Recall' , 'F1' , 'AUC']
bar_specs = [('Train Metrics Scores' , train_metrics_list , '#67001f') ,
             ('Test Metrics Scores' , test_metrics_list , '#fddbc7')]

fig = go.Figure()
for trace_name , scores , color in bar_specs:
    fig.add_trace(go.Bar(name = trace_name ,
                         x = metrics_name ,
                         y = scores ,
                         text = np.round(scores , 4) ,
                         marker_color = color))

# Uniform bar width with a black outline for readability.
fig.update_traces(width = 0.4 , marker = dict(line = dict(color = '#000000' , width = 2)))
fig.update_layout(xaxis_title = 'Metrics',
                  yaxis_title = 'Metrics scors',
                  font = dict(size = 17 , family = 'Franklin Gothic') ,
                  template = 'simple_white' ,
                  title = 'Train Metrics Scores & Test Metrics Scores')
fig.show()